{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "67a33665-b829-463b-ac95-e2262cf460fc",
   "metadata": {},
   "source": [
    "## Lab 2 -Pitfalls of retrieval - when simple vector search fails! "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a5536f0-651c-40e7-aa15-27ee0cda80b7",
   "metadata": {
    "height": 148
   },
   "outputs": [],
   "source": [
    "from helper_utils import load_chroma, word_wrap\n",
    "from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction\n",
    "\n",
    "embedding_function = SentenceTransformerEmbeddingFunction()\n",
    "\n",
    "chroma_collection = load_chroma(filename='microsoft_annual_report_2022.pdf', collection_name='microsoft_annual_report_2022', embedding_function=embedding_function)\n",
    "chroma_collection.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3748b16d-d4a7-49c3-a48a-57dcfc42acd6",
   "metadata": {
    "height": 131
   },
   "outputs": [],
   "source": [
    "import umap\n",
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "\n",
    "embeddings = chroma_collection.get(include=['embeddings'])['embeddings']\n",
    "umap_transform = umap.UMAP(random_state=0, transform_seed=0).fit(embeddings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a338ec83-6301-41a5-9ab1-e5d583306a3f",
   "metadata": {
    "height": 114
   },
   "outputs": [],
   "source": [
    "def project_embeddings(embeddings, umap_transform):\n",
    "    umap_embeddings = np.empty((len(embeddings),2))\n",
    "    for i, embedding in enumerate(tqdm(embeddings)): \n",
    "        umap_embeddings[i] = umap_transform.transform([embedding])\n",
    "    return umap_embeddings   "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "888a86f8-2fe2-4682-bdaf-c15129ed1a32",
   "metadata": {
    "height": 46
   },
   "outputs": [],
   "source": [
    "projected_dataset_embeddings = project_embeddings(embeddings, umap_transform)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5665c695-22ea-4264-b1ac-5ba720b6d78b",
   "metadata": {
    "height": 148
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "plt.figure()\n",
    "plt.scatter(projected_dataset_embeddings[:, 0], projected_dataset_embeddings[:, 1], s=10)\n",
    "plt.gca().set_aspect('equal', 'datalim')\n",
    "plt.title('Projected Embeddings')\n",
    "plt.axis('off')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b6bc10a6-68b0-4086-b2d1-86464b761e32",
   "metadata": {},
   "source": [
    "## Relevancy and Distraction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ba6c8c5-9ce4-44d0-9223-6fdd77871f87",
   "metadata": {
    "height": 182
   },
   "outputs": [],
   "source": [
    "query = \"What is the total revenue?\"\n",
    "\n",
    "results = chroma_collection.query(query_texts=query, n_results=5, include=['documents', 'embeddings'])\n",
    "\n",
    "retrieved_documents = results['documents'][0]\n",
    "\n",
    "for document in results['documents'][0]:\n",
    "    print(word_wrap(document))\n",
    "    print('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bfdb54db-a442-423c-b006-c33a257cd7d7",
   "metadata": {
    "height": 131
   },
   "outputs": [],
   "source": [
    "query_embedding = embedding_function([query])[0]\n",
    "retrieved_embeddings = results['embeddings'][0]\n",
    "\n",
    "projected_query_embedding = project_embeddings([query_embedding], umap_transform)\n",
    "projected_retrieved_embeddings = project_embeddings(retrieved_embeddings, umap_transform)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "377a84aa-1d93-4e97-9b2d-d59c46355338",
   "metadata": {
    "height": 182
   },
   "outputs": [],
   "source": [
    "# Plot the projected query and retrieved documents in the embedding space\n",
    "plt.figure()\n",
    "plt.scatter(projected_dataset_embeddings[:, 0], projected_dataset_embeddings[:, 1], s=10, color='gray')\n",
    "plt.scatter(projected_query_embedding[:, 0], projected_query_embedding[:, 1], s=150, marker='X', color='r')\n",
    "plt.scatter(projected_retrieved_embeddings[:, 0], projected_retrieved_embeddings[:, 1], s=100, facecolors='none', edgecolors='g')\n",
    "\n",
    "plt.gca().set_aspect('equal', 'datalim')\n",
    "plt.title(f'{query}')\n",
    "plt.axis('off')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba0ed8ca-6640-4c09-9cb3-9de5e7cf46dc",
   "metadata": {
    "height": 165
   },
   "outputs": [],
   "source": [
    "query = \"What is the strategy around artificial intelligence (AI) ?\"\n",
    "results = chroma_collection.query(query_texts=query, n_results=5, include=['documents', 'embeddings'])\n",
    "\n",
    "retrieved_documents = results['documents'][0]\n",
    "\n",
    "for document in results['documents'][0]:\n",
    "    print(word_wrap(document))\n",
    "    print('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "28bac3a2-0d29-48dc-9b48-2d9313239a25",
   "metadata": {
    "height": 131
   },
   "outputs": [],
   "source": [
    "query_embedding = embedding_function([query])[0]\n",
    "retrieved_embeddings = results['embeddings'][0]\n",
    "\n",
    "projected_query_embedding = project_embeddings([query_embedding], umap_transform)\n",
    "projected_retrieved_embeddings = project_embeddings(retrieved_embeddings, umap_transform)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db9f2758-0f5a-49e5-b1fa-517b91324575",
   "metadata": {
    "height": 182
   },
   "outputs": [],
   "source": [
    "# Plot the projected query and retrieved documents in the embedding space\n",
    "plt.figure()\n",
    "plt.scatter(projected_dataset_embeddings[:, 0], projected_dataset_embeddings[:, 1], s=10, color='gray')\n",
    "plt.scatter(projected_query_embedding[:, 0], projected_query_embedding[:, 1], s=150, marker='X', color='r')\n",
    "plt.scatter(projected_retrieved_embeddings[:, 0], projected_retrieved_embeddings[:, 1], s=100, facecolors='none', edgecolors='g')\n",
    "\n",
    "plt.gca().set_aspect('equal', 'datalim')\n",
    "plt.title(f'{query}')\n",
    "plt.axis('off')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aee59493-8a99-4da8-b94f-4747efcfc79d",
   "metadata": {
    "height": 165
   },
   "outputs": [],
   "source": [
    "query = \"What has been the investment in research and development?\"\n",
    "results = chroma_collection.query(query_texts=query, n_results=5, include=['documents', 'embeddings'])\n",
    "\n",
    "retrieved_documents = results['documents'][0]\n",
    "\n",
    "for document in results['documents'][0]:\n",
    "    print(word_wrap(document))\n",
    "    print('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f5eda9bc-ae76-4db6-9e0c-ae099d852d78",
   "metadata": {
    "height": 131
   },
   "outputs": [],
   "source": [
    "query_embedding = embedding_function([query])[0]\n",
    "retrieved_embeddings = results['embeddings'][0]\n",
    "\n",
    "projected_query_embedding = project_embeddings([query_embedding], umap_transform)\n",
    "projected_retrieved_embeddings = project_embeddings(retrieved_embeddings, umap_transform)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b1183e75-4c65-422e-bc47-48010d8b29c9",
   "metadata": {
    "height": 182
   },
   "outputs": [],
   "source": [
    "# Plot the projected query and retrieved documents in the embedding space\n",
    "plt.figure()\n",
    "plt.scatter(projected_dataset_embeddings[:, 0], projected_dataset_embeddings[:, 1], s=10, color='gray')\n",
    "plt.scatter(projected_query_embedding[:, 0], projected_query_embedding[:, 1], s=150, marker='X', color='r')\n",
    "plt.scatter(projected_retrieved_embeddings[:, 0], projected_retrieved_embeddings[:, 1], s=100, facecolors='none', edgecolors='g')\n",
    "\n",
    "plt.gca().set_aspect('equal', 'datalim')\n",
    "plt.title(f'{query}')\n",
    "plt.axis('off')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5fcd85cc-8898-41ed-a0aa-bd8a33fc565a",
   "metadata": {
    "height": 165
   },
   "outputs": [],
   "source": [
    "query = \"What has Michael Jordan done for us lately?\"\n",
    "results = chroma_collection.query(query_texts=query, n_results=5, include=['documents', 'embeddings'])\n",
    "\n",
    "retrieved_documents = results['documents'][0]\n",
    "\n",
    "for document in results['documents'][0]:\n",
    "    print(word_wrap(document))\n",
    "    print('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c65337e9-85ee-47f7-89fd-7fe77cd0e1b2",
   "metadata": {
    "height": 131
   },
   "outputs": [],
   "source": [
    "query_embedding = embedding_function([query])[0]\n",
    "retrieved_embeddings = results['embeddings'][0]\n",
    "\n",
    "projected_query_embedding = project_embeddings([query_embedding], umap_transform)\n",
    "projected_retrieved_embeddings = project_embeddings(retrieved_embeddings, umap_transform)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a7794092-4195-4cf3-9eab-11c9c05a26b9",
   "metadata": {
    "height": 182
   },
   "outputs": [],
   "source": [
    "# Plot the projected query and retrieved documents in the embedding space\n",
    "plt.figure()\n",
    "plt.scatter(projected_dataset_embeddings[:, 0], projected_dataset_embeddings[:, 1], s=10, color='gray')\n",
    "plt.scatter(projected_query_embedding[:, 0], projected_query_embedding[:, 1], s=150, marker='X', color='r')\n",
    "plt.scatter(projected_retrieved_embeddings[:, 0], projected_retrieved_embeddings[:, 1], s=100, facecolors='none', edgecolors='g')\n",
    "\n",
    "plt.gca().set_aspect('equal', 'datalim')\n",
    "plt.title(f'{query}')\n",
    "plt.axis('off')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f2cab7a1-1be7-45f0-83b7-543e48f83901",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fe0343be-73c9-4aed-83b0-aba09569ac87",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f0f3e33-e517-4f6b-8b38-c47c1e3d40b4",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a16fdcb1-57d0-4f04-af8f-7c7fc594d947",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "babe7893-9cbc-43c5-94ef-cbf8f5d68cf2",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "60a9524b-1085-4bdf-a161-39f11397dc1f",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d189f088-b58e-4583-9590-afdfa624cf87",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7b26a01a-4575-446b-b8dc-a8c5ab153172",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0950575b-b69d-46a3-8c91-c7af89f5c204",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f123ad8-b2e8-4a25-8b42-a520ecaf566b",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "83c04587-d1de-419c-a213-2e3eb67dc33d",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3155972-824e-4ebe-a692-2227c113c5a8",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d8144a4a-85f6-4800-87f9-36a1b6ceda1f",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ff0b18e-12a0-4ac0-97dd-8618b22e7dbf",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "03ca7e7c-4b47-4652-9b46-a40b3dffa5e6",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f74e7d67-7f51-41c4-8e25-edbaa02d0bd8",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9188e886-d406-406f-b234-f5c3353a77a2",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d3bb286-2694-4ed4-8466-46865e997ced",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2876084b-4038-4b0c-8ec8-8294a86adfc1",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3ac542e1-b094-431f-9611-cf7e36d3f0de",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bcd6114b-c09d-4173-a623-9a08aaf63e4b",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad10ab65-b351-4f4b-b7d2-63474acfb9f9",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "800f3d81-cbdb-4ba4-8d49-85747fdfded8",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37847448-c9f6-4f51-bf06-f7809964a8b2",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2dcefc87-0964-4b94-946b-2145781ad606",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5fc994bc-7b1e-476a-9df9-300a3e374882",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9ef5f5d5-acb7-4b0a-93ef-e61306708e69",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "44e4b33f-d8fb-4f3a-b884-8b43a3766583",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2a480a2-2c29-4a01-80dd-ee41934b7901",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8127c2bf-0d15-4b62-b46a-f7a17ad2ec92",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18ded129-a637-4269-a116-550fe9a90570",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c1d7ee44-7b29-483f-a3f2-cc9d8e18880e",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e450dd8-9719-42c6-8c3c-33cac910e0a5",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
